************************************************
* Title: rwanda_jde_tableA9.do
* Author: Todd Pugatch
* Last update: June 10 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	teacher_merge_jde.dta
*			student_merge_jde.dta
*			rwanda_school_trainattend_jde.dta
* Outputs: 	rwanda_jde_tableA9.[txt/out]
* Notes: produces Table A9
*************************************************

local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"

* begin log file
log using "$temp/rwanda_jde_tableA9.txt", text replace

* data prep: merge take-up with teacher & student surveys, by school
local takeup "training_any_pct exchange_pct"
qui use "$cleandata/teacher_merge_jde.dta", clear
qui merge m:1 school_code using "$cleandata/rwanda_school_trainattend_jde.dta", keepusing(`takeup')
qui drop if _merge==2
drop _merge
qui save "$temp/teachertemp.dta", replace
	
qui use "$cleandata/student_merge_jde.dta", clear
qui merge m:1 school_code using "$cleandata/rwanda_school_trainattend_jde.dta", keepusing(`takeup')
qui drop if _merge==2
drop _merge
qui save "$temp/studenttemp.dta", replace	

****************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ACTIVE INSTRUCTION TIME: teacher observation data
/*Goal: produce columns 1-6 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction time on treatment status, controlling
		for randomization strata.
	Note that baseline data (BTQ300-302) only available for subset of teachers.*/
qui use "$temp/teachertemp.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* prep missing values for teachers without baseline outcome
qui su pedagogy_active_index_bl if treatment==0 & insample_bl==1
qui replace pedagogy_active_index_bl=r(mean) if insample_bl!=1
qui gen pedagogy_active_index_blm=(insample_bl!=1)
lab var pedagogy_active_index_blm "missing value for `y0'"

* regressions: Table P3, columns 1-6
local Y "mainact_teacher_active_pct	mainact_teacher_active_0_27	mainact_teacher_active_28_52"
local Y0 "pedagogy_active_index_bl pedagogy_active_index_blm"

/*all observed classes: cols 1-3*/
local i=1
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `Y0' i.strata, robust
	scalar define H2_c`i'b=_b[exchange_pct]
	scalar define H2_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H2_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

/*Skills Lab only: cols 4-6*/
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `Y0' i.strata if skillslab_class_el==1, robust
	scalar define H2_c`i'b=_b[exchange_pct]
	scalar define H2_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H2_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

/*how many Skills Labs in treatment v. control?*/
tab skillslab_class_el treatment, mi		
		
* ACTIVE INSTRUCTION TECHNIQUES: teacher observation and student endline data
/*Goal: produce columns 7-9 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction techniques on exchange visit attendance (%),
		instumented by treatment status. Control for randomization strata.
	Note that baseline data not applicable here.
	For data using student outcomes, cluster outcomes by school.*/

qui use "$temp/teachertemp.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* regressions: Table P3, columns 7-8 (teacher observation data)
/*column 7: all classes*/
qui ivregress 2sls method_curric_pct_el (exchange_pct=treatment) i.strata, robust
scalar define H2_c`i'b=_b[exchange_pct]
scalar define H2_c`i'se=_se[exchange_pct]
scalar define t=_b[exchange_pct]/_se[exchange_pct]
scalar define dfr=e(N)-e(df_m)
scalar define H2_c`i'p=2*ttail(dfr,abs(t))
local i=`i'+1

/*column 8: Skills Labs only*/
qui ivregress 2sls method_curric_pct_el (exchange_pct=treatment) i.strata if skillslab_class_el==1, robust
scalar define H2_c`i'b=_b[exchange_pct]
scalar define H2_c`i'se=_se[exchange_pct]
scalar define t=_b[exchange_pct]/_se[exchange_pct]
scalar define dfr=e(N)-e(df_m)
scalar define H2_c`i'p=2*ttail(dfr,abs(t))
local i=`i'+1

* regressions: Table P3, column 9 (student endline)
/*Note that the dependent variable was defined inconsistently in two places of the Stage 1 Registered 
	Report. Therefore, include both "narrow" measure (consistent with table of hypotheses 
	[Table 1, ESQ600-603]) and "broad" measure (consistent with Table P3, ESQ600-611).*/
qui use "$temp/studenttemp.dta", clear
local Y "active_instruct_narrow_el active_instruct_broad_el"

foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) i.strata, cluster(school_code)
	scalar define H2_c`i'b=_b[exchange_pct]
	scalar define H2_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H2_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local n=`i'-1
qui set obs `n'
qui gen hypothesis=2
qui gen column=_n
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=1/`n' {
	qui replace b=H2_c`i'b in `i'
	qui replace se=H2_c`i'se in `i'
	qui replace pval=H2_c`i'p in `i' 
}

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001.*/
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis column b se pval bky06_qval

* prep results for inclusion in final regression output
forval i=1/`n' {
	scalar define H2_c`i'q=bky06_qval in `i'
}
		
* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
/*Goal: produce columns 1-6 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction time on treatment status, controlling
		for randomization strata.
	Include sharpened q-values calculated above.
	Note that baseline data (BTQ300-302) only available for subset of teachers.*/		

* ACTIVE INSTRUCTION TIME: teacher observation data
/*Goal: produce columns 1-6 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction time on exchange visit attendance (%),
		instumented by treatment status. Control for baseline outcome and 
		randomization strata.
	Note that baseline data (BTQ300-302) only available for subset of teachers.*/

qui use "$temp/teachertemp.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* prep missing values for teachers without baseline outcome
qui su pedagogy_active_index_bl if treatment==0 & insample_bl==1
qui replace pedagogy_active_index_bl=r(mean) if insample_bl!=1
qui gen pedagogy_active_index_blm=(insample_bl!=1)
lab var pedagogy_active_index_blm "missing value for `y0'"

* regressions: Table P3, columns 1-6
local stat ""1st stage F",F,"control mean",mu_c,"baseline mean",mu_0,"q-value",qv"
local Y "mainact_teacher_active_0_27	mainact_teacher_active_28_52"
local Y0 "pedagogy_active_index_bl pedagogy_active_index_blm"

/*all observed classes: cols 1-3*/
local i=1

* mainact_teacher_active_pct
	qui ivregress 2sls mainact_teacher_active_pct (exchange_pct=treatment) `Y0' i.strata, robust
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su mainact_teacher_active_pct if treatment==0
	scalar define mu_c=r(mean)
	qui su pedagogy_active_index_bl if insample_bl==1
	scalar define mu_0=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') replace
	local i=`i'+1

foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `Y0' i.strata, robust
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su `y' if treatment==0
	scalar define mu_c=r(mean)
	qui su pedagogy_active_index_bl if insample_bl==1
	scalar define mu_0=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*Skills Lab only: cols 4-6*/
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `Y0' i.strata if skillslab_class_el==1, robust
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su `y' if treatment==0 & skillslab_class_el==1
	scalar define mu_c=r(mean)
	qui su pedagogy_active_index_bl if insample_bl==1 & skillslab_class_el==1
	scalar define mu_0=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*how many Skills Labs in treatment v. control?*/
tab skillslab_class_el treatment, mi

* ACTIVE INSTRUCTION TECHNIQUES: teacher observation and student endline data
/*Goal: produce columns 7-9 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction techniques on exchange visit attendance (%),
		instumented by treatment status. Control for randomization strata.
	Note that baseline data not applicable here.
	For data using student outcomes, cluster outcomes by school.*/

qui use "$temp/teachertemp.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* regressions: Table P3, columns 7-8 (teacher observation data)
local stat ""1st stage F",F,"control mean",mu_c,"q-value",qv"

/*column 7: all classes*/
qui ivregress 2sls method_curric_pct_el (exchange_pct=treatment) i.strata, robust
qui estat first
matrix define X=r(singleresults)
scalar define F=X[1,4]
qui su method_curric_pct_el if treatment==0
scalar define mu_c=r(mean)
scalar define qv=H2_c`i'q
outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') append
local i=`i'+1

/*column 8: Skills Labs only*/
qui ivregress 2sls method_curric_pct_el (exchange_pct=treatment) i.strata if skillslab_class_el==1, robust
qui estat first
matrix define X=r(singleresults)
scalar define F=X[1,4]
qui su method_curric_pct_el if treatment==0
scalar define mu_c=r(mean)
scalar define qv=H2_c`i'q
outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') append
local i=`i'+1

* regressions: Table P3, column 9 (student endline)
/*Note that the dependent variable was defined inconsistently in two places of the Stage 1 Registered 
	Report. Therefore, include both "narrow" measure (consistent with table of hypotheses 
	[Table 1, ESQ600-603]) and "broad" measure (consistent with Table P3, ESQ600-611).*/

qui use "$temp/studenttemp.dta", clear
local Y "active_instruct_narrow_el active_instruct_broad_el"

foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) i.strata, cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su `y' if treatment==0
	scalar define mu_c=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA9.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}


erase "$temp/teachertemp.dta"
erase "$temp/studenttemp.dta"
local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close

